# Identify values causing the issueproblematic_values <- properties$number_of_rooms[is.na(as.numeric(properties$number_of_rooms))]#> Warning: NAs introduced by coercion# Replace non-numeric values with NAproperties$number_of_rooms <-as.numeric(gsub("[^0-9.]", "", properties$number_of_rooms))# Remove non-numeric characters and convert to numericproperties$price <-as.numeric(gsub("[^0-9]", "", properties$price))# Subset the dataset to exclude rows with price < 20000properties <- properties[properties$price >=20000, ]# Subset the dataset to exclude rows with numbers of rooms < 25properties <- properties[properties$number_of_rooms <25, ]# Replace incomplete addressesproperties$address <-gsub("^\\W*[.,0-]\\W*", "", properties$address)properties_filtered <-na.omit(properties)properties_filtered$year_category <-substr(properties_filtered$year_category, 1, 9)# Assuming 'year_category' is a column in the 'properties' datasetproperties_filtered$year_category <-as.factor(properties_filtered$year_category)# Preprocess the number_of_rooms columnproperties_filtered$number_of_rooms <-as.character(properties_filtered$number_of_rooms)properties_filtered$number_of_rooms <-gsub("\\D", "", properties_filtered$number_of_rooms) # Remove non-numeric charactersproperties_filtered$number_of_rooms <-as.numeric(properties_filtered$number_of_rooms) # Convert to numericproperties_filtered$number_of_rooms <-trunc(properties_filtered$number_of_rooms) # Truncate non-integer values# remove m^2 from column 'square_meters'properties_filtered$square_meters <-as.numeric(gsub("\\D", "", properties_filtered$square_meters))# print how many NA observations left in square_metersprint(sum(is.na(properties_filtered$square_meters)))#> [1] 988# remove NAproperties_filtered <- properties_filtered[!is.na(properties_filtered$square_meters),]# add majuscule to cantonproperties_filtered$canton <- tools::toTitleCase(properties_filtered$canton)# show 100 first row of cleaned dataset using reactablereactable(head(properties_filtered, 100))
The dataset described is the “Official Index of Localities” (Répertoire officiel des localités) provided by the Swiss Federal Office of Topography (swisstopo). It contains comprehensive information about all localities in Switzerland and the Principality of Liechtenstein, including their names, postal codes, and perimeters.
This dataset is regularly updated on a monthly basis, incorporating changes reported by cantonal authorities and Swiss Post. It serves various purposes, including spatial analysis, integration with other geographic datasets, usage as a geolocated background in GIS (Geographic Information Systems) and CAD (Computer-Aided Design) systems, statistical analysis, and as a reference dataset for information systems.
Updates and release notes for the dataset are provided periodically, detailing changes and improvements made over time. The Swiss Federal Office of Topography manages and distributes this dataset as part of its responsibilities in collecting and providing official geospatial data for Switzerland.
2.1.3.1 Creating Variable zip_code and merging with AMTOVZ_CSV_LV95
Code
df <- properties_filtered#the address column is like : '1844 Villeneuve VD' and has zip code number in it#taking out the zip code number and creating a new column 'zip_code'#the way to identify the zip code is to identify numbers that are 4 digits longdf$zip_code <-as.numeric(gsub("\\D", "", df$address))#removing the first two number of zip code has more than 4 numberdf$zip_code <-ifelse(df$zip_code >9999, df$zip_code %%10000, df$zip_code)
2.1.3.2 Using AMTOVZ_CSV_LV95 to get the city and canton from the zip code
Code
#read .csv AMTOVZ_CSV_LV95amto <-read.csv(file.path(here(),"data/AMTOVZ_CSV_WGS84.csv"), sep =";")#creating a new dataframe with 'Ortschaftsname' as 'City'Place_name', 'PLZ' as 'zip_code', 'KantonskÃ.rzel' as 'Canton_code', 'E' as 'lon' and 'N' as 'lat'amto_df <- amto[, c('Gemeindename', 'PLZ', 'Kantonskürzel', 'E', 'N')]#renaming the columnscolnames(amto_df) <-c('Community', 'zip_code', 'Canton_code', 'lon', 'lat')#remove duplicates of zip codeamto_df <- amto_df[!duplicated(amto_df$zip_code),]#add the variable of amto_df to the df if the zip code matchesdf <-merge(df, amto_df, by ="zip_code", all.x =TRUE)#check if there are nan in citydf[is.na(df$Community),]#> zip_code price number_of_rooms square_meters#> 1 25 2200000 65 165#> 2 25 2200000 10 263#> 3 26 655000 35 66#> 4 26 1995000 75 180#> 5 322 870000 25 59#> 6 322 880000 25 55#> 7 322 975000 35 56#> 230 1014 1510000 55 146#> 1137 1200 16092000 7 400#> 1138 1200 3285450 5 230#> 1139 1200 679000 55 142#> 5478 1919 785000 35 103#> 5479 1919 1908000 65 210#> 5480 1919 2558620 55 270#> 5481 1919 1065000 45 130#> 7621 2500 1100000 5 154#> 7622 2500 420000 45 115#> 7623 2500 885500 55 130#> 7624 2500 872500 45 144#> 7625 2500 872500 45 138#> 7626 2500 887500 55 130#> 7627 2500 870500 45 125#> 7628 2500 892500 45 144#> 7629 2500 885500 55 130#> 7630 2500 1050000 45 121#> 7631 2500 877500 45 138#> 7632 2500 887500 45 144#> 7633 2500 1450000 55 198#> 8325 3000 820000 55 165#> 8326 3000 1140000 35 115#> 8327 3000 1090000 35 115#> 8328 3000 1090000 55 193#> 8329 3000 920000 45 157#> 8330 3000 1090000 55 193#> 8331 3000 1590000 55 330#> 8332 3000 720000 35 102#> 8333 3000 920000 45 157#> 10434 4000 180000 3 70#> 10435 4000 975000 45 125#> 10436 4000 2100000 65 360#> 12358 5201 725000 35 95#> 13212 6000 695000 45 133#> 13964 6511 440000 2 64#> 14240 6547 15000000 75 220#> 14558 6602 2800000 75 242#> 14559 6602 2800000 65 250#> 14560 6602 270000 15 28#> 14561 6602 450000 35 75#> 14562 6604 1990000 45 220#> 14563 6604 2668590 55 290#> 14564 6604 760000 35 78#> 16575 6901 3660930 45 290#> 16576 6901 3660930 45 290#> 16577 6903 790000 35 105#> 16578 6907 995000 45 114#> 16579 6907 995000 45 114#> 16580 6911 469350 55 140#> 16581 6911 610000 35 103#> 16582 6911 660000 75 200#> 16583 6911 737550 45 82#> 17892 7133 2266290 55 160#> 17901 7135 2690000 85 236#> 18161 8000 2100000 45 152#> 18162 8000 1650000 45 142#> 18163 8000 925000 35 102#> 18164 8000 1650000 45 142#> 18165 8000 1150000 45 128#> 18166 8000 1450000 55 143#> 18167 8000 1990000 55 200#> 18168 8000 1990000 55 200#> 18169 8000 975000 45 122#> 18170 8000 2495000 55 482#> 18650 8238 245000 2 49#> 19074 8423 2110000 65 204#> 19075 8423 2190000 55 167#> 20288 9241 545000 45 100#> 20289 9241 730840 55 130#> address#> 1 1000 Lausanne 25#> 2 1000 Lausanne 25#> 3 1000 Lausanne 26#> 4 Lausanne 26, 1000 Lausanne 26#> 5 Via Cuolm Liung 30d, 7032 Laax GR 2#> 6 7032 Laax GR 2#> 7 Via Murschetg 29, 7032 Laax GR 2#> 230 1014 Lausanne#> 1137 1200 Genève#> 1138 1200 Genève#> 1139 Chemin des pralets, 74100 Etrembières, 1200 Genève#> 5478 1919 Martigny#> 5479 1919 Martigny#> 5480 1919 Martigny#> 5481 1919 Martigny#> 7621 2500 Biel/Bienne#> 7622 2500 Biel/Bienne#> 7623 2500 Biel/Bienne#> 7624 2500 Biel/Bienne#> 7625 2500 Biel/Bienne#> 7626 2500 Biel/Bienne#> 7627 2500 Biel/Bienne#> 7628 2500 Biel/Bienne#> 7629 2500 Biel/Bienne#> 7630 Hohlenweg 11b, 2500 Biel/Bienne#> 7631 2500 Biel/Bienne#> 7632 2500 Biel/Bienne#> 7633 2500 Bienne#> 8325 3000 Bern#> 8326 3000 Bern#> 8327 3000 Bern#> 8328 3000 Bern#> 8329 3000 Bern#> 8330 3000 Bern#> 8331 3000 Bern#> 8332 3000 Bern#> 8333 3000 Bern#> 10434 Lörrach Brombach Steinsack 6, 4000 Basel#> 10435 4000 Basel#> 10436 4000 Basel#> 12358 5201 Brugg AG#> 13212 in TRIENGEN, ca. 20 min. bei Luzern, 6000 Luzern#> 13964 6511 Cadenazzo#> 14240 Augio 1F, 6547 Augio#> 14558 6602 Muralto#> 14559 6602 Muralto#> 14560 6602 Muralto#> 14561 Via Bacilieri 2, 6602 Muralto#> 14562 6604 Solduno#> 14563 6604 Solduno#> 14564 6604 Locarno#> 16575 6901 Lugano#> 16576 6901 Lugano#> 16577 6903 Lugano#> 16578 6907 MASSAGNO#> 16579 6907 MASSAGNO#> 16580 6911 Campione d'Italia#> 16581 6911 Campione d'Italia#> 16582 6911 Campione d'Italia#> 16583 6911 Campione d'Italia#> 17892 Inder Platenga 34, 7133 Obersaxen#> 17901 7135 Fideris#> 18161 8000 Zürich#> 18162 8000 Zürich#> 18163 8000 Zürich#> 18164 8000 Zürich#> 18165 8000 Zürich#> 18166 8000 Zürich#> 18167 8000 Zürich#> 18168 8000 Zürich#> 18169 8000 Zürich#> 18170 8000 Zürich#> 18650 Stemmerstrasse 14, 8238 Büsingen am Hochrhein#> 19074 Chüngstrasse 60, 8423 Embrach#> 19075 Chüngstrasse 48, 8423 Embrach#> 20288 9241 Kradolf#> 20289 9241 Kradolf#> canton property_type floor year_category Community#> 1 Vaud Villa 2006-2010 <NA>#> 2 Vaud Single house 1919-1945 <NA>#> 3 Vaud Apartment noteg 2016-2024 <NA>#> 4 Vaud Villa 1961-1970 <NA>#> 5 Grisons Apartment eg 2016-2024 <NA>#> 6 Grisons Apartment noteg 2016-2024 <NA>#> 7 Grisons Apartment noteg 2011-2015 <NA>#> 230 Vaud Apartment eg 2011-2015 <NA>#> 1137 Geneva Single house 2011-2015 <NA>#> 1138 Geneva Bifamiliar house 1981-1990 <NA>#> 1139 Geneva Bifamiliar house 2016-2024 <NA>#> 5478 Valais Apartment noteg 2016-2024 <NA>#> 5479 Valais Apartment noteg 2016-2024 <NA>#> 5480 Valais Attic flat noteg 2016-2024 <NA>#> 5481 Valais Apartment noteg 2016-2024 <NA>#> 7621 Bern Single house 2001-2005 <NA>#> 7622 Bern Apartment noteg 1971-1980 <NA>#> 7623 Bern Villa 2016-2024 <NA>#> 7624 Bern Villa 2016-2024 <NA>#> 7625 Bern Single house 2016-2024 <NA>#> 7626 Bern Single house 2016-2024 <NA>#> 7627 Bern Single house 2016-2024 <NA>#> 7628 Bern Single house 2016-2024 <NA>#> 7629 Bern Single house 2016-2024 <NA>#> 7630 Bern Single house 2001-2005 <NA>#> 7631 Bern Single house 2016-2024 <NA>#> 7632 Bern Single house 2016-2024 <NA>#> 7633 Bern Single house 2016-2024 <NA>#> 8325 Bern Apartment noteg 2016-2024 <NA>#> 8326 Bern Apartment eg 2016-2024 <NA>#> 8327 Bern Apartment eg 2016-2024 <NA>#> 8328 Bern Roof flat noteg 2016-2024 <NA>#> 8329 Bern Apartment noteg 2016-2024 <NA>#> 8330 Bern Apartment noteg 2016-2024 <NA>#> 8331 Bern Apartment noteg 1991-2000 <NA>#> 8332 Bern Apartment eg 2016-2024 <NA>#> 8333 Bern Duplex noteg 2016-2024 <NA>#> 10434 Basel-Stadt Single house 1961-1970 <NA>#> 10435 Basel-Stadt Single house 2016-2024 <NA>#> 10436 Basel-Stadt Villa 2016-2024 <NA>#> 12358 Aargau Apartment noteg 2016-2024 <NA>#> 13212 Lucerne Apartment noteg 1991-2000 <NA>#> 13964 Ticino Apartment noteg 2016-2024 <NA>#> 14240 Grisons Single house 2016-2024 <NA>#> 14558 Ticino Single house 1981-1990 <NA>#> 14559 Ticino Single house 1981-1990 <NA>#> 14560 Ticino Apartment eg 1961-1970 <NA>#> 14561 Ticino Apartment noteg 1946-1960 <NA>#> 14562 Ticino Attic flat noteg 2011-2015 <NA>#> 14563 Ticino Apartment noteg 2011-2015 <NA>#> 14564 Ticino Apartment noteg 2011-2015 <NA>#> 16575 Ticino Attic flat noteg 2011-2015 <NA>#> 16576 Ticino Apartment noteg 2011-2015 <NA>#> 16577 Ticino Apartment noteg 2006-2010 <NA>#> 16578 Ticino Apartment noteg 2016-2024 <NA>#> 16579 Ticino Apartment noteg 2016-2024 <NA>#> 16580 Ticino Apartment noteg 1946-1960 <NA>#> 16581 Ticino Apartment eg 1946-1960 <NA>#> 16582 Ticino Single house 1971-1980 <NA>#> 16583 Ticino Apartment noteg 1991-2000 <NA>#> 17892 Grisons Single house 2006-2010 <NA>#> 17901 Grisons Single house 0-1919 <NA>#> 18161 Zurich Apartment noteg 2016-2024 <NA>#> 18162 Zurich Attic flat noteg 2016-2024 <NA>#> 18163 Zurich Apartment noteg 2016-2024 <NA>#> 18164 Zurich Apartment noteg 2016-2024 <NA>#> 18165 Zurich Apartment noteg 2016-2024 <NA>#> 18166 Zurich Apartment eg 2016-2024 <NA>#> 18167 Zurich Apartment noteg 2006-2010 <NA>#> 18168 Zurich Attic flat noteg 2006-2010 <NA>#> 18169 Zurich Single house 2016-2024 <NA>#> 18170 Zurich Apartment noteg 0-1919 <NA>#> 18650 Schaffhausen Apartment noteg 1961-1970 <NA>#> 19074 Zurich Bifamiliar house 2016-2024 <NA>#> 19075 Zurich Single house 2016-2024 <NA>#> 20288 Thurgau Apartment noteg 1991-2000 <NA>#> 20289 Thurgau Apartment noteg 1991-2000 <NA>#> Canton_code lon lat#> 1 <NA> NA NA#> 2 <NA> NA NA#> 3 <NA> NA NA#> 4 <NA> NA NA#> 5 <NA> NA NA#> 6 <NA> NA NA#> 7 <NA> NA NA#> 230 <NA> NA NA#> 1137 <NA> NA NA#> 1138 <NA> NA NA#> 1139 <NA> NA NA#> 5478 <NA> NA NA#> 5479 <NA> NA NA#> 5480 <NA> NA NA#> 5481 <NA> NA NA#> 7621 <NA> NA NA#> 7622 <NA> NA NA#> 7623 <NA> NA NA#> 7624 <NA> NA NA#> 7625 <NA> NA NA#> 7626 <NA> NA NA#> 7627 <NA> NA NA#> 7628 <NA> NA NA#> 7629 <NA> NA NA#> 7630 <NA> NA NA#> 7631 <NA> NA NA#> 7632 <NA> NA NA#> 7633 <NA> NA NA#> 8325 <NA> NA NA#> 8326 <NA> NA NA#> 8327 <NA> NA NA#> 8328 <NA> NA NA#> 8329 <NA> NA NA#> 8330 <NA> NA NA#> 8331 <NA> NA NA#> 8332 <NA> NA NA#> 8333 <NA> NA NA#> 10434 <NA> NA NA#> 10435 <NA> NA NA#> 10436 <NA> NA NA#> 12358 <NA> NA NA#> 13212 <NA> NA NA#> 13964 <NA> NA NA#> 14240 <NA> NA NA#> 14558 <NA> NA NA#> 14559 <NA> NA NA#> 14560 <NA> NA NA#> 14561 <NA> NA NA#> 14562 <NA> NA NA#> 14563 <NA> NA NA#> 14564 <NA> NA NA#> 16575 <NA> NA NA#> 16576 <NA> NA NA#> 16577 <NA> NA NA#> 16578 <NA> NA NA#> 16579 <NA> NA NA#> 16580 <NA> NA NA#> 16581 <NA> NA NA#> 16582 <NA> NA NA#> 16583 <NA> NA NA#> 17892 <NA> NA NA#> 17901 <NA> NA NA#> 18161 <NA> NA NA#> 18162 <NA> NA NA#> 18163 <NA> NA NA#> 18164 <NA> NA NA#> 18165 <NA> NA NA#> 18166 <NA> NA NA#> 18167 <NA> NA NA#> 18168 <NA> NA NA#> 18169 <NA> NA NA#> 18170 <NA> NA NA#> 18650 <NA> NA NA#> 19074 <NA> NA NA#> 19075 <NA> NA NA#> 20288 <NA> NA NA#> 20289 <NA> NA NA
We have 77 NAN, where
The zip code was not found in the atmo df
The zip code was incorectly isolated from the address
Removed them ::: {.cell layout-align=“center”}
Code
#remove the rows with nan in cityproperties_filtered <- df[!is.na(df$Community),]reactable(head(properties_filtered, 100))
# read csvimpots <-read.csv(file.path(here(),"data/estv_income_rates.csv"), sep =",", header =TRUE, stringsAsFactors =FALSE)# Remove 1st rowimpots <- impots[-1, ]# Remove 3rd columnimpots <- impots[, -3]# Combine text for columns 4-8impots[1, 4:8] <-"Impôt sur le revenu"# Combine text for columns 9-13impots[1, 9:13] <-"Impôt sur la fortune"# Combine text for columns 14-16impots[1, 14:16] <-"Impôt sur le bénéfice"# Combine text for columns 17-19impots[1, 17:19] <-"Impôt sur le capital"# Combine content of the first 2 rows into the 2nd rowimpots[2, ] <-apply(impots[1:2, ], 2, function(x) paste(ifelse(is.na(x[1]), x[2], ifelse(is.na(x[2]), x[1], paste(x[1], x[2], sep =" ")))))# Remove 1st rowimpots <- impots[-1, ]# Assign the text to the 1st row and 1st columnimpots[1, 1] <-"Coefficient d'impôt en %"# Replace column names with the content of the first rowcolnames(impots) <- impots[1, ]impots <- impots[-1, ]# Check for missing values in impotsany_missing <-any(is.na(impots))if (any_missing) {print("There are missing values in impots.")} else {print("There are no missing values in impots.")}#> [1] "There are no missing values in impots."# Replace row names with the content of the 3rd columnrow.names(impots) <- impots[, 3]impots <- impots[, -3]# Remove 2nd column (to avoid canton column)impots <- impots[, -2]# Remove impot egliseimpots <- impots[, -c(4:6)]impots <- impots[, -c(6:8)]impots <- impots[, -8]impots <- impots[, -10]# Clean data and convert to numericcleaned_impots <-apply(impots, 2, function(x) as.numeric(gsub("[^0-9.-]", "", x)))# Replace NA values with 0cleaned_impots[is.na(cleaned_impots)] <-0# Check for non-numeric valuesnon_numeric <-sum(!is.na(cleaned_impots) &!is.numeric(cleaned_impots))if (non_numeric >0) {print(paste("Warning: Found", non_numeric, "non-numeric values."))}rownames(cleaned_impots) <-rownames(impots)#reactable(head(cleaned_impots, 100))
2.1.5 Commune Data
2.1.5.1 Cleaning
ajouter source
ajouter description
expliquer blabla
Replaces NAs in both Taux de couverture social and Political (Conseil National Datas) For Taux de couverture Social: NAs were due to reason “Q” = “Not indicated to protect confidentiality” We replaced the NAs by the average taux de couverture in Switzerland in 2019, which was 3.2%
For Political data: NAs were due to reason “M” = “Not indicated because data was not important or applicable” Therefore, we replaced the NAs by 0
Code
# il faudra changer le pathcommune_prep <-read.csv(file.path(here(),"data/commune_data.csv"), sep =";", header =TRUE, stringsAsFactors =FALSE)# We keep only 2019 to have some reference? (2020 is apparently not really complete)commune_2019 <-subset(commune_prep, PERIOD_REF =="2019") %>%select(c("REGION", "CODE_REGION", "INDICATORS", "VALUE", "STATUS"))# delete les lignes ou Status = Q ou M (pas de valeur) et ensuite on enlève la colonnecommune_2019 <-subset(commune_2019, STATUS =="A") %>%select(c("REGION", "CODE_REGION", "INDICATORS", "VALUE"))# on enlève les lignes qui sont des aggrégatscommune_2019 <-subset(commune_2019, REGION !="Schweiz")commune_2019 <- commune_2019 %>%pivot_wider(names_from = INDICATORS, values_from = VALUE)# Rename columns using the provided mapcommune <- commune_2019 %>%rename(`Population - Habitants`= Ind_01_01,`Population - Densité de la population`= Ind_01_03,`Population - Etrangers`= Ind_01_08,`Population - Part du groupe d'âge 0-19 ans`= Ind_01_04,`Population - Part du groupe d'âge 20-64 ans`= Ind_01_05,`Population - Part du groupe d'âge 65+ ans`= Ind_01_06,`Population - Taux brut de nuptialité`= Ind_01_09,`Population - Taux brut de divortialité`= Ind_01_10,`Population - Taux brut de natalité`= Ind_01_11,`Population - Taux brut de mortalité`= Ind_01_12,`Population - Ménages privés`= Ind_01_13,`Population - Taille moyenne des ménages`= Ind_01_14,`Sécurité sociale - Taux d'aide sociale`= Ind_11_01,`Conseil national - PLR`= Ind_14_01,`Conseil national - PDC`= Ind_14_02,`Conseil national - PS`= Ind_14_03,`Conseil national - UDC`= Ind_14_04,`Conseil national - PEV/PCS`= Ind_14_05,`Conseil national - PVL`= Ind_14_06,`Conseil national - PBD`= Ind_14_07,`Conseil national - PST/Sol.`= Ind_14_08,`Conseil national - PES`= Ind_14_09,`Conseil national - Petits partis de droite`= Ind_14_10)# If no one voted for a party, set as NA -> replacing it with 0 insteadcommune <- commune %>%mutate_at(vars(starts_with("Conseil national")), ~replace_na(., 0))# Removing NAs from Taux de couverture sociale column# Setting the mean as the mean for Switzerland in 2019 (3.2%)mean_taux_aide_social <-3.2# Replace NA values with the meancommune <- commune %>%mutate(`Sécurité sociale - Taux d'aide sociale`=if_else(is.na(`Sécurité sociale - Taux d'aide sociale`), mean_taux_aide_social, `Sécurité sociale - Taux d'aide sociale`))#show 100 first rows of commune using reactablereactable(head(commune, 100))
Code
# commune_prep <- read.csv(file.path(here(),"data/commune_data.csv"), sep = ";", header = TRUE, stringsAsFactors = FALSE)# # # We keep only 2019 to have some reference? (2020 is apparently not really complete)# commune_2019 <- subset(commune_prep, PERIOD_REF == "2019") %>%# select(c("REGION", "CODE_REGION", "INDICATORS", "VALUE", "STATUS"))# # # delete les lignes ou Status = Q ou M (pas de valeur) et ensuite on enlève la colonne# commune_2019 <- subset(commune_2019, STATUS == "A") %>%# select(c("REGION", "CODE_REGION", "INDICATORS", "VALUE"))# # # on enlève les lignes qui sont des aggrégats# commune_2019 <- subset(commune_2019, REGION != "Schweiz")# # commune_2019 <- commune_2019 %>%# pivot_wider(names_from = INDICATORS, values_from = VALUE)# # # Rename columns using the provided map# commune <- commune_2019 %>%# rename(`Population - Habitants` = Ind_01_01,# `Population - Densité de la population` = Ind_01_03,# `Population - Etrangers` = Ind_01_08,# `Population - Part du groupe d'âge 0-19 ans` = Ind_01_04,# `Population - Part du groupe d'âge 20-64 ans` = Ind_01_05,# `Population - Part du groupe d'âge 65+ ans` = Ind_01_06,# `Population - Taux brut de nuptialité` = Ind_01_09,# `Population - Taux brut de divortialité` = Ind_01_10,# `Population - Taux brut de natalité` = Ind_01_11,# `Population - Taux brut de mortalité` = Ind_01_12,# `Population - Ménages privés` = Ind_01_13,# `Population - Taille moyenne des ménages` = Ind_01_14,# `Sécurité sociale - Taux d'aide sociale` = Ind_11_01,# `Conseil national - PLR` = Ind_14_01,# `Conseil national - PDC` = Ind_14_02,# `Conseil national - PS` = Ind_14_03,# `Conseil national - UDC` = Ind_14_04,# `Conseil national - PEV/PCS` = Ind_14_05,# `Conseil national - PVL` = Ind_14_06,# `Conseil national - PBD` = Ind_14_07,# `Conseil national - PST/Sol.` = Ind_14_08,# `Conseil national - PES` = Ind_14_09,# `Conseil national - Petits partis de droite` = Ind_14_10)# # # If no one voted for a party, set as NA -> replacing it with 0 instead# commune <- commune %>%# mutate_at(vars(starts_with("Conseil national")), ~replace_na(., 0))# # # # Removing NAs from Taux de couverture sociale column# # Setting the mean as the mean for Switzerland in 2019 (3.2%)# mean_taux_aide_social <- 3.2# # # Replace NA values with the mean# commune <- commune %>%# mutate(`Sécurité sociale - Taux d'aide sociale` = if_else(is.na(`Sécurité sociale - Taux d'aide sociale`), mean_taux_aide_social, `Sécurité sociale - Taux d'aide sociale`))#
3 EDA
3.1 Map representation of distribution of properties
Code
# Create a leaflet map with optimized markersmap <-leaflet(properties_filtered) %>%addTiles() %>%# Add default OpenStreetMap tilesaddProviderTiles(providers$Esri.NatGeoWorldMap) %>%# Add topographic maps for contextaddCircleMarkers(~lon, ~lat,radius =1.5, # Smaller radius for the circle markerscolor ="#32012F", # Specifying a color for the markersfillOpacity =0.2, # Semi-transparent fillstroke =FALSE, # No border to the circle markers to reduce visual noisepopup =~paste("Price: ", price, "<br>","Rooms: ", number_of_rooms, "<br>","Type: ", property_type, "<br>","Year: ", year_category),label =~paste("Price: ", price) # Tooltip on hover ) %>%addLegend(position ="bottomright", # Position the legend at the bottom rightcolors ="#32012F", # Use the same color as the markerslabels ="Properties"# Label for the legend )map$width <-"100%"# Set the width of the map to 100%map$height <-600# Set the height of the map to 600 pixelsmap
3.2 Histogram of prices
Code
histogram_price <-ggplot(properties_filtered, aes(x = price)) +geom_histogram(binwidth =100000, fill ="skyblue", color ="red") +labs(title ="Distribution of Prices",x ="Price",y ="Frequency") +theme_minimal()# Convert ggplot object to plotly objectinteractive_histogram_price <-ggplotly(histogram_price, width =600, height =400 )# Display the interactive histograminteractive_histogram_price
3.3 Histogram of prices for each property type
note : only price between 0 and 500000 so some outliers aren’t here
Code
# Create the ggplot objecthistogram <-ggplot(properties_filtered, aes(x = price)) +geom_histogram(binwidth =100000, fill ="skyblue", color ="black") +facet_wrap(~ property_type, scales ="free", ncol =2) +labs(title ="Distribution of Prices by Property Type",x ="Price",y ="Frequency") +theme_minimal() +xlim(0, 5000000)# Convert ggplot object to plotly objectinteractive_histogram <-ggplotly(histogram, width =600, height =1000)# Display the interactive plotinteractive_histogram
3.4 Histogram of prices for each year category
note : only price between 0 and 500000 so some outliers aren’t here
Code
# Create a histogram of prices for each year categoryhistogram <-ggplot(properties_filtered, aes(x = price)) +geom_histogram(binwidth =100000, fill ="skyblue", color ="black") +facet_wrap(~ year_category, scales ="free", ncol =2) +labs(title ="Distribution of Prices by Year Category",x ="Price",y ="Frequency") +theme_minimal() +xlim(0, 5000000)# Convert ggplot object to plotly objectinteractive_histogram_year <-ggplotly(histogram, width =600, height =1000)# Display the interactive plotinteractive_histogram_year
3.5 Histogram of prices for each canton
note : only price between 0 and 500000 so some outliers aren’t here
Code
histogram <-ggplot(properties_filtered, aes(x = price)) +geom_histogram(binwidth =100000, fill ="skyblue", color ="black") +facet_wrap(~ canton, scales ="free", ncol =2) +labs(title ="Distribution of Prices by Canton",x ="Price",y ="Frequency") +theme_minimal() +xlim(0, 5000000)# Convert ggplot object to plotly object with adjusted heightinteractive_histogram <-ggplotly(histogram, width =600, height =1000) %>%layout(height =1000) # Adjust the height as needed# Display the interactive plotinteractive_histogram
3.6 Histogram of prices for each number of rooms
note : only price between 0 and 500000 so some outliers aren’t here
and the graph below only show apartments with less than 10 rooms (but you can change the code if needed
Code
properties_room <- properties_filtered[properties_filtered$number_of_rooms <20, ] # Filter only number_of_rooms less than 20# Create a histogram of prices for each number of roomshistogram <-ggplot(properties_room, aes(x = price)) +geom_histogram(binwidth =100000, fill ="skyblue", color ="black") +facet_wrap(~ number_of_rooms, scales ="free", ncol =2) +labs(title ="Distribution of Prices by Number of Rooms",x ="Price",y ="Frequency") +theme_minimal() +xlim(0, 5000000)# Convert ggplot object to plotly object with adjusted heightinteractive_histogram <-ggplotly(histogram, width =600, height =1000)# Display the interactive plotinteractive_histogram
3.7 Histogram of prices with impot
Code
# colnames(properties_filtered)[(ncol(properties_filtered) - 3):ncol(properties_filtered)] <- gsub("\\s+", "_", colnames(properties_filtered)[(ncol(properties_filtered) - 3):ncol(properties_filtered)])# # # Create a scatter plot to visualize correlation between price and Impôt cantonal# scatter_plot <- ggplot(properties_filtered, aes(x = price, y = Impôt_cantonal_impots)) +# geom_point() +# labs(title = "Correlation between Price and Impôt cantonal",# x = "Price",# y = "Impôt cantonal") +# theme_minimal()# # # Convert ggplot object to plotly object# interactive_plot <- ggplotly(scatter_plot)# # # Display the interactive plot# interactive_plot
Code
impot_cols <-names(properties_filtered)[startsWith(names(properties_filtered), "Impôt")]# Count the number of NA values in selected columnsna_counts <-colSums(is.na(properties_filtered[impot_cols]))# Print the countsprint(na_counts)#> numeric(0)
4 Unsupervised learning
Clustering and/or dimension reduction
Trying to Cluster commune datas to: 1. Reduce dimension 2. See similarities
A regarder, est-ce qu’on fait un cluster pour les datas politques + un cluster pour les data démographiques, ou est-ce qu’on regroupe tout?
Code
set.seed(123)# Clustering demographiccols_commune_demographic <-select(commune, -c("REGION", "CODE_REGION","Conseil national - PLR","Conseil national - PDC", "Conseil national - PS", "Conseil national - UDC", "Conseil national - PEV/PCS", "Conseil national - PVL", "Conseil national - PBD", "Conseil national - PST/Sol.", "Conseil national - PES", "Conseil national - Petits partis de droite"))# Scale the columns, some are total numbers, some are percentagescols_commune_demographic <-scale(cols_commune_demographic)# Calculate the distance matrixdist_matrix_demographic <-dist(cols_commune_demographic, method ="minkowski")# Perform hierarchical clusteringhclust_model_demographic <-hclust(dist_matrix_demographic, method ="ward.D")# Create dendrogramdend_demo <-as.dendrogram(hclust_model_demographic)dend_demo <-color_branches(dend_demo, k =5) #Set number of cluster to 5, to keep the same scale for all our variablesplot(dend_demo, main ="Demographics - Hierarchical Clustering Dendrogram")
Code
# Clustering politicsset.seed(123)cols_commune_politics <-select(commune, c("Conseil national - PLR","Conseil national - PDC", "Conseil national - PS", "Conseil national - UDC", "Conseil national - PEV/PCS", "Conseil national - PVL", "Conseil national - PBD", "Conseil national - PST/Sol.", "Conseil national - PES", "Conseil national - Petits partis de droite"))# Scale the columns, some are total numbers, some are percentagescols_commune_politics <-scale(cols_commune_politics)# Calculate the distance matrixdist_matrix_politics <-dist(cols_commune_politics, method ="minkowski")# Perform hierarchical clusteringhclust_model_politics <-hclust(dist_matrix_politics, method ="ward.D")# Create dendrogramdend_pol <-as.dendrogram(hclust_model_politics)dend_pol <-color_branches(dend_pol, k =5) #Set number of cluster to 5, to keep the same scale for all our variablesplot(dend_pol, main ="Politics - Hierarchical Clustering Dendrogram")
4.1 Tax
Code
set.seed(123)# Clean data and convert to numericcleaned_impots <-apply(impots, 2, function(x) as.numeric(gsub("[^0-9.-]", "", x)))cleaned_impots[is.na(cleaned_impots)] <-0# Replace NA values with 0# Scale the featuresscaled_impots <-scale(cleaned_impots)# Perform k-means clusteringk <-2# Initial guess for the number of clusterskmeans_model <-kmeans(scaled_impots, centers = k)# Check within-cluster sum of squares (elbow method)wss <-numeric(10)for (i in1:10) { kmeans_model <-kmeans(scaled_impots, centers = i) wss[i] <-sum(kmeans_model$withinss)}# plot(1:10, wss, type = "b", xlab = "Number of Clusters", ylab = "Within groups sum of squares")# Adjust k based on elbow methodk <-5# Perform k-means clustering again with optimal kkmeans_model <-kmeans(scaled_impots, centers = k)# Assign cluster labels to dendrogramclusters <- kmeans_model$cluster# Plot dendrogram#colored_dend <- color_branches(dend, k = 5)#y_zoom_range <- c(0, 80) # Adjust the y-axis range as needed#plot(colored_dend, main = "Hierarchical Clustering Dendrogram", horiz = FALSE, ylim = y_zoom_range)
# Preparing df_commune for merging with main datasetdf_commune <-select(commune, REGION)df_commune$Demographic_cluster <-cutree(hclust_model_demographic, k =5)df_commune$Political_cluster <-cutree(hclust_model_politics, k =5)# Preparing to mergemerging <-inner_join(amto_df, df_commune, by =c("Community"="REGION"))impots_cluster_subset <- impots_cluster[, c("Community", "cluster")]merging <- merging %>%left_join(impots_cluster_subset, by ="Community")clusters_df <- merging %>%rename(Tax_cluster = cluster) %>%rename(Commune = Community)clusters_df <- clusters_df %>%select(c("Commune", "zip_code", "Canton_code", "Demographic_cluster", "Political_cluster", "Tax_cluster"))# Only NAs are for commune Brugg, (written Brugg (AG) in the other data set) -> j'entre le cluster à la manoclusters_df$Tax_cluster[is.na(clusters_df$Tax_cluster)] <-2# adding it to our main data set:properties_filtered <-merge(properties_filtered, clusters_df[, c("zip_code", "Demographic_cluster", "Political_cluster", "Tax_cluster")], by ="zip_code", all.x =TRUE)na_count <-sum(is.na(properties_filtered[, c("Demographic_cluster", "Political_cluster", "Tax_cluster")]))# Print the resultif (na_count >0) {print("There are NA values in the merged dataframe.")print(na_count)} else {print("There are no NA values in the merged dataframe.")}#> [1] "There are NA values in the merged dataframe."#> [1] 678# Find rows with NA values in the specified columnsna_rows <-subset(properties_filtered, is.na(Demographic_cluster) |is.na(Political_cluster) |is.na(Tax_cluster))